# %matplotlib widget
%matplotlib inline
import os
import io
import re
import pickle
import base64
import gridfs
import pymongo
import numpy as np
import pandas as pd
from PIL import Image
from pprint import pprint
from pymongo import MongoClient
import matplotlib.pyplot as plt
import mpl_toolkits.axes_grid1
from IPython.display import HTML
from collections import OrderedDict
import matplotlib.widgets as widgets
import matplotlib.animation as animation
from matplotlib.animation import FuncAnimation
# Show every row/column when rendering DataFrames in the notebook
pd.set_option('display.max_columns', None) # or cap at e.g. 1000
pd.set_option('display.max_rows', None) # or cap at e.g. 1000
# Connect to the local MongoDB client (default port)
client = MongoClient('localhost', 27017)
print('databases:', client.list_database_names())
# Select the database holding the sacred experiment runs
db = client['MY_DB']
print('Collections of {} db: {}'.format(db.name, db.list_collection_names()))
def slice_dict(d, keys):
    """Return a new dict with the entries of ``d`` for the given keys,
    in the given order.

    Parameters
    ----------
    d : dict
        Source mapping.
    keys : list or str
        Iterable of keys, or a CSV-like string ('a, b', 'a,b', 'a b').
        Trailing commas and empty tokens are ignored.

    Raises
    ------
    KeyError
        If a requested key is missing from ``d``.
    """
    if isinstance(keys, str):
        # Split on any run of commas/whitespace and drop empty tokens.
        # The previous code crashed with IndexError on '' and raised
        # KeyError('') on doubled separators like 'a,,b'.
        keys = [k for k in re.split(r'[,\s]+', keys) if k]
    return {k: d[k] for k in keys}
def sacred_to_df(db_runs, mongo_query=None, ):
    """
    Summarize sacred experiment runs, one row per experiment.

    db_runs is usually db.runs; mongo_query is an optional filter dict
    passed straight to ``find``.

    Returns a DataFrame indexed by ``_id`` with columns:
    name (experiment name), **config (flattened), every info key except
    'metrics', plus the raw 'info' dict and 'status'.

    NOTE(review): 'result' and 'start_time' are fetched below but then
    dropped by the 'info, status' slice -- confirm whether they were
    meant to be kept in the summary.
    """
    # get all experiments according to mongo query and represent as a pandas DataFrame
    df = pd.DataFrame(list(db_runs.find(mongo_query)))
    # Take only the interesting columns
    df = df.loc[:, '_id, experiment, config, result, info, status, start_time'.split(', ')]
    def _summerize_experiment(s):
        """
        Flatten one run (a DataFrame row) into a flat Series.
        """
        o = OrderedDict()
        o['_id'] = s['_id']
        o['name'] = s['experiment']['name']
        # flatten the config dict into top-level keys
        o.update(s['config'])
        # flatten info, skipping the bulky per-step 'metrics' payload
        for key, val in s['info'].items():
            if key != 'metrics':
                o[key] = val
        # keep the raw info dict and the run status as their own columns
        o.update(slice_dict(s.to_dict(), 'info, status'))
        return pd.Series(o)
    sum_list = []
    for ix, s in df.iterrows():
        sum_list.append(_summerize_experiment(s))
    df_summary = pd.DataFrame(sum_list).set_index('_id')
    return df_summary
# Get the COMPLETED experiments
query = "status == 'COMPLETED'"
df_summary = sacred_to_df(db.runs)
# Fix column names: strip, lower-case, underscore spaces, drop parentheses
# and dots. regex=False is essential -- with the regex default, '.' and
# '(' are treated as patterns ('.' would blank out every column name).
df_summary.columns = (df_summary.columns.str.strip().str.lower()
                      .str.replace(' ', '_', regex=False)
                      .str.replace('(', '', regex=False)
                      .str.replace(')', '', regex=False)
                      .str.replace('.', '_', regex=False))
# Now execute the query. query() returns a NEW frame -- the original
# discarded the result, so the COMPLETED filter was never applied.
df_summary = df_summary.query(query)
# Sort them in ascending order based on validation loss (best performer is first).
df_summary = df_summary.sort_values('best_valid_loss', ascending=True)
display(df_summary.head())
def query_by_df(mongo_db_runs, df=None, ids=None):
    """Fetch raw experiment documents for a summary frame or id list.

    Pass either a summary DataFrame (its index supplies the ids) or an
    explicit ``ids`` list. Rows come back in the requested order.
    """
    if ids is None:
        ids = df.index.tolist()
    # The Mongo cursor yields documents in arbitrary order, so fetch
    # everything first...
    cursor = mongo_db_runs.find({'_id': {'$in': ids}})
    raw = pd.DataFrame(list(cursor)).set_index('_id')
    # ...then restore the caller's ordering explicitly.
    return raw.reindex(ids)
# Fetch the raw run documents in the summary's order
df_raw = query_by_df(db.runs, df_summary)
df_raw.head()  # notebook cell output: preview the first rows
from bson.objectid import ObjectId
# For displaying images inside DataFrame HTML: never truncate cell
# contents. -1 was deprecated and later removed by pandas; the
# supported spelling is None.
pd.set_option('display.max_colwidth', None)
def get_thumbnail(path):
    """Load the image at *path* and shrink it in place to fit 150x150."""
    img = Image.open(path)
    img.thumbnail((150, 150), Image.LANCZOS)
    return img
def image_base64(im):
    """Return *im* (a PIL image, or a file path) as a base64 PNG string."""
    img = get_thumbnail(im) if isinstance(im, str) else im
    with io.BytesIO() as png_buf:
        img.save(png_buf, 'png')
        return base64.b64encode(png_buf.getvalue()).decode()
def fig2pngbuf(fig):
    """Render a Matplotlib figure to an in-memory PNG buffer.

    Returns an ``io.BytesIO`` positioned at the start, ready for
    ``PIL.Image.open``.
    """
    buffer = io.BytesIO()
    # Force a draw so the canvas holds up-to-date pixel data.
    fig.canvas.draw()
    w, h = fig.canvas.get_width_height()
    # NOTE: the original also did ``buffer.shape = (w, h)``, which raises
    # AttributeError -- io.BytesIO does not accept arbitrary attributes.
    image = Image.frombytes(
        'RGB',
        (w, h),
        # tostring_rgb was removed in newer Matplotlib (use buffer_rgba
        # there) -- TODO confirm the pinned version still provides it.
        fig.canvas.tostring_rgb()
    )
    image.save(buffer, 'PNG')
    buffer.seek(0)  # rewind so Image.open() reads from the beginning
    return buffer
def fig2data(fig):
    """
    @brief Convert a Matplotlib figure to a numpy array with RGBA channels and return it
    @param fig a matplotlib figure
    @return a numpy 3D array of shape (height, width, 4) with RGBA values
    """
    # draw the renderer so the canvas buffer is populated
    fig.canvas.draw()
    # Get the raw ARGB buffer from the figure
    w, h = fig.canvas.get_width_height()
    buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
    # The raster buffer is row-major (rows of pixels), so height comes
    # first: (h, w, 4). The original (w, h, 4) silently scrambled any
    # non-square figure (same element count, wrong layout).
    buf = buf.reshape(h, w, 4)
    # tostring_argb gives ARGB; roll the alpha channel to get RGBA
    buf = np.roll(buf, 3, axis=2)
    return buf
def fig2img(fig):
    """
    @brief Convert a Matplotlib figure to a PIL Image and return it
    @param fig a matplotlib figure
    @return a Python Imaging Library (PIL) image decoded from PNG bytes
    """
    png_buffer = fig2pngbuf(fig)
    return Image.open(png_buffer)
def image_formatter(im):
    """Render *im* as an inline HTML <img> tag with embedded PNG data."""
    encoded = image_base64(im)
    return f'<img src="data:image/png;base64,{encoded}">'
def metrics_to_name_id_dict(list_metrics):
    """Map metric names to their ObjectIds ({name: ObjectId}).

    Raises RuntimeError if the same metric name appears twice.
    """
    name_to_id = {}
    for entry in list_metrics:
        name = entry['name']
        if name in name_to_id:
            raise RuntimeError('%s metric has duplicates' % name)
        name_to_id[name] = ObjectId(entry['id'])
    return name_to_id
def experiment_metrics(db, exp_id):
    """Fetch all metric documents recorded for experiment *exp_id*."""
    run = db.runs.find_one(dict(_id=exp_id))
    name_to_id = metrics_to_name_id_dict(run['info']['metrics'])
    # Resolve every metric id to its full document in db.metrics
    return {name: db.metrics.find_one(dict(_id=oid))
            for name, oid in name_to_id.items()}
def plot_experiment_metrics(db, exp_id, metrics_to_plot=None, rename_metrics=None):
    """Plot selected metrics of one experiment on the current axes.

    Parameters
    ----------
    db : pymongo database with ``runs`` and ``metrics`` collections.
    exp_id : the experiment's ``_id``.
    metrics_to_plot : list of metric names; defaults to all, sorted.
    rename_metrics : optional {metric_name: display_name} mapping.

    Returns
    -------
    list of legend labels (renamed where a mapping entry exists).
    """
    # None-default instead of {}: a mutable default is one shared dict
    # across all calls (classic Python pitfall).
    if rename_metrics is None:
        rename_metrics = {}
    metrics = experiment_metrics(db, exp_id)
    if metrics_to_plot is None:
        metrics_to_plot = sorted(metrics.keys())
    legend = []
    for name in metrics_to_plot:
        plt.plot(metrics[name]['steps'], metrics[name]['values'])
        # rename the metric if a new name is given
        legend.append(rename_metrics.get(name, name))
    return legend
def summarize_metrics(df_summarized):
    """Build per-experiment metric tables and rendered plots.

    Parameters
    ----------
    df_summarized : summary DataFrame (as produced by sacred_to_df),
        indexed by experiment _id with a 'name' column.

    Returns
    -------
    (df_metrics, df_plots) : best accuracy/loss per experiment, and PIL
        images of the train/valid/test metric curves.

    Uses the module-level ``db`` connection.
    """
    # Best scalar value of each metric, per experiment
    d_metrics = {
        ('Accuracy', 'train'): [],
        ('Accuracy', 'valid'): [],
        ('Accuracy', 'test'): [],
        ('Loss', 'train'): [],
        ('Loss', 'valid'): []
    }
    # One rendered figure per experiment and phase
    d_plots = {
        'Model': [],
        'Train_Metrics': [],
        'Valid_Metrics': [],
        'Test_Metrics': []
    }
    exper_names = []
    for idx, name in enumerate(df_summarized['name']):
        exper_names.append(name)
        d_plots['Model'].append(name)
        metrics = experiment_metrics(db, int(df_summarized.index[idx]))
        d_metrics[('Accuracy', 'train')].append(np.max(metrics['training.acc']['values']))
        d_metrics[('Accuracy', 'valid')].append(np.max(metrics['validation.acc']['values']))
        d_metrics[('Accuracy', 'test')].append(np.max(metrics['test.accuracy']['values']))
        d_metrics[('Loss', 'train')].append(np.min(metrics['training.loss']['values']))
        d_metrics[('Loss', 'valid')].append(np.min(metrics['validation.loss']['values']))
        for mode in ['train', 'valid', 'test']:
            # '==' not 'is': identity comparison of string literals is
            # implementation-dependent (and a SyntaxWarning on 3.8+).
            metric_name = 'training' if mode == 'train' else 'validation'
            plt.ioff()  # suppress inline rendering while figures are built
            fig = plt.figure()
            if mode == 'test':
                legend = plot_experiment_metrics(
                    db,
                    # was the global df_summary -- use the parameter
                    int(df_summarized.index[idx]),
                    ['test.accuracy'],
                    rename_metrics={'test.accuracy': 'Test Accuracy'}
                )
            else:
                legend = plot_experiment_metrics(
                    db,
                    int(df_summarized.index[idx]),
                    [metric_name + '.acc', metric_name + '.loss', metric_name + '.error'],
                    rename_metrics={metric_name + '.acc': 'Train Accuracy',
                                    metric_name + '.loss': 'Training Loss',
                                    metric_name + '.error': 'MSE on labels'}
                )
            plt.legend(legend)
            plt.xlabel('Epoch')
            plt.ylabel('Values')
            plt.title(name)
            plt.grid(True)
            d_plots[mode.capitalize() + '_Metrics'].append(fig2img(fig))
            plt.close('all')
    df_metrics = pd.DataFrame(d_metrics, index=exper_names)
    df_plots = pd.DataFrame(d_plots)
    return df_metrics, df_plots
# Build the metric table and per-phase plot images for all experiments
metrics, plots = summarize_metrics(df_summary)
display(metrics)
# Show the results for each model and dataset combination
HTML(plots[['Model', 'Train_Metrics', 'Valid_Metrics', 'Test_Metrics']].to_html(formatters={'Train_Metrics': image_formatter, 'Valid_Metrics': image_formatter, 'Test_Metrics': image_formatter}, escape=False))
# GridFS handle for reading experiment artifacts (images) stored in the DB
fs = gridfs.GridFS(db)
def exp_artifacts_to_dict(list_artifacts):
    """Group artifact records by filename: {filename: [file_id, ...]}."""
    grouped = {}
    for artifact in list_artifacts:
        # setdefault creates the list on first sight of a name, then
        # appends -- same grouping as the original if/else.
        grouped.setdefault(artifact['name'], []).append(artifact['file_id'])
    return grouped
# Collect before/after reliability-diagram images for each experiment
exper = {'Model': [], 'Before': [], 'After': []}
for idx, ex in enumerate(df_raw.experiment):
    # Map artifact filename -> list of GridFS file ids for this run
    gfs_artifacts = exp_artifacts_to_dict(df_raw.artifacts.iloc[idx])
    # Raw PNG bytes of the diagrams before/after calibration
    imgs_bytes_before = [fs.get(imgObj).read() for imgObj in gfs_artifacts['reliab.png']]
    imgs_bytes_after = [fs.get(imgObj).read() for imgObj in gfs_artifacts['reliab_calib.png']]
    imgs_before = [Image.open(io.BytesIO(img)) for img in imgs_bytes_before]
    imgs_after = [Image.open(io.BytesIO(img)) for img in imgs_bytes_after]
    exper['Model'].append(ex['name'])
    exper['Before'].append(imgs_before)
    exper['After'].append(imgs_after)
# NOTE(review): flattening assumes exactly one image per artifact name per
# run; more than one would desync these columns from 'Model' -- confirm.
exper['Before'] = sum(exper['Before'], []) # flatten nested lists
exper['After'] = sum(exper['After'], [])
df_calibrated = pd.DataFrame(exper)
# Show the results for each model and dataset combination
HTML(df_calibrated[['Model', 'Before', 'After']].to_html(formatters={'Before': image_formatter, 'After': image_formatter}, escape=False))